#Loading necessary libraries for sentiment analysis

library(sentimentr)
library(qdap)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(tm)
library(tidyverse)
library(SentimentAnalysis)
library(quanteda)
library(xlsx)

#Creating API variable for “Article Search API FORMAT”

# NYT Article Search API key.
# Prefer the NYTIMES_KEY environment variable so the secret does not have to
# live in the source file; the literal is kept only as a fallback so that
# existing runs keep working unchanged.
# NOTE(review): this key is committed in plain text - rotate it and then
# remove the fallback.
NYTIMES_KEY <- Sys.getenv("NYTIMES_KEY")
if (!nzchar(NYTIMES_KEY)) {
  NYTIMES_KEY <- "a0HA3uBISDkGyvUGR3FeoAGybtDVPPM5"
}

#creating the query necessary to extract wanted information

#Professor Johnson still has to give me the list of words necessary for query

#Doesn’t seem like we can do “and” “or” for the specific query, something that academic search complete and pro quest are able to do

#Just do it for one month 2012/01/01 - 2012/02/01

#The default connector for values in parentheses is OR.

# Query parameters: the search window, given as YYYYMMDD strings.
# A "+"-joined term list was tried first, but the words end up concatenated:
#term <- "immigrant+immigration"
begin_date <- "19800101"
end_date <- "20191201" # shrink the window to about 2 months to get data faster
#glocations <- "U.S"

#Searching for the terms that we query above and essentially creating the query object in order to use it for the API

#the https needs to be underlined

#immigrant OR migrant OR migration OR refugee OR asylum OR undocumented"

#it needs to be fq not q

# Base request URL for the Article Search API; "&page=<n>" is appended later.
# - The terms go in the filter query (`fq`, not `q`) and are OR-ed together.
# - The query is percent-encoded so the spaces around OR are valid in a URL.
# - paste0() has no `sep` argument: the old `sep = " "` was simply pasted onto
#   the end of the URL, leaving a stray space after the API key.
baseurl <- paste0(
  "https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=",
  URLencode(
    "immigrant OR immigration OR migrant OR migration OR refugee OR asylum OR undocumented",
    reserved = TRUE
  ),
  "&begin_date=", begin_date,
  "&end_date=", end_date,
  "&facet_filter=true&api-key=", NYTIMES_KEY
)

#Returning a Json object and calculating the max pages from the query, there is a max of 10 objects (newspaper articles) per page

# First request: only used to read the total number of hits.
initialQuery <- fromJSON(baseurl)
# Pages are numbered from 0 and hold at most 10 articles each, so the last page
# index is ceiling(hits / 10) - 1. round() could drop a final partial page
# (e.g. 1452 hits -> 144 instead of 145); max() keeps the index at 0 or above
# when there are no hits.
maxPages <- max(ceiling(initialQuery$response$meta$hits[1] / 10) - 1, 0)

#from the maxpages we are pasting the baseurl and retrieving the information and putting it in a dataframe

#Putting the system to sleep in order to ‘trick’ the computer for the amount of requests made per minute

#query from JAN 2012 - 2019 JAN took >24 minutes to run with sleep 30

#it doesn’t like it when using %>% data.frame() in the end of the third line

#require(reshape2)
# Download every result page (numbered 0..maxPages) as a flattened data frame.
# Page i is stored at position i + 1 of `pages`, which is allocated up front
# instead of being grown inside the loop.
pages <- vector("list", maxPages + 1)
for (i in seq_along(pages) - 1L) { # empty loop (not 0, -1) if there are no pages
  nytSearch <- fromJSON(paste0(baseurl, "&page=", i), flatten = TRUE) %>% data.frame()
  message("Retrieving page ", i)
  pages[[i + 1]] <- nytSearch
  # Pause between requests to stay under the API's per-minute request limit;
  # no need to wait after the final page.
  if (i < maxPages) {
    Sys.sleep(30) #change to 5 to test on smaller datasets, 30 for actual execution
  }
}
Retrieving page 0
Retrieving page 1
Retrieving page 2
Retrieving page 3
Retrieving page 4
Retrieving page 5
Retrieving page 6
Retrieving page 7
Retrieving page 8
Retrieving page 9
Retrieving page 10
Retrieving page 11
Retrieving page 12
Retrieving page 13
Retrieving page 14
Retrieving page 15
Retrieving page 16
Retrieving page 17
Retrieving page 18
Retrieving page 19
Retrieving page 20
Retrieving page 21
Retrieving page 22
Retrieving page 23
Retrieving page 24
Retrieving page 25
Retrieving page 26
Retrieving page 27
Retrieving page 28
Retrieving page 29
Retrieving page 30
Retrieving page 31
Retrieving page 32
Retrieving page 33
Retrieving page 34
Retrieving page 35
Retrieving page 36
Retrieving page 37
Retrieving page 38
Retrieving page 39
Retrieving page 40
Retrieving page 41
Retrieving page 42
Retrieving page 43
Retrieving page 44
Retrieving page 45
Retrieving page 46
Retrieving page 47
Retrieving page 48
Retrieving page 49
Retrieving page 50
Retrieving page 51
Retrieving page 52
Retrieving page 53
Retrieving page 54
Retrieving page 55
Retrieving page 56
Retrieving page 57
Retrieving page 58
Retrieving page 59
Retrieving page 60
Retrieving page 61
Retrieving page 62
Retrieving page 63
Retrieving page 64
Retrieving page 65
Retrieving page 66
Retrieving page 67
Retrieving page 68
Retrieving page 69
Retrieving page 70
Retrieving page 71
Retrieving page 72
Retrieving page 73
Retrieving page 74
Retrieving page 75
Retrieving page 76
Retrieving page 77
Retrieving page 78
Retrieving page 79
Retrieving page 80
Retrieving page 81
Retrieving page 82
Retrieving page 83
Retrieving page 84
Retrieving page 85
Retrieving page 86
Retrieving page 87
Retrieving page 88
Retrieving page 89
Retrieving page 90
Retrieving page 91
Retrieving page 92
Retrieving page 93
Retrieving page 94
Retrieving page 95
Retrieving page 96
Retrieving page 97
Retrieving page 98
Retrieving page 99
Retrieving page 100
Retrieving page 101
Retrieving page 102
Retrieving page 103
Retrieving page 104
Retrieving page 105
Retrieving page 106
Retrieving page 107
Retrieving page 108
Retrieving page 109
Retrieving page 110
Retrieving page 111
Retrieving page 112
Retrieving page 113
Retrieving page 114
Retrieving page 115
Retrieving page 116
Retrieving page 117
Retrieving page 118
Retrieving page 119
Retrieving page 120
Retrieving page 121
Retrieving page 122
Retrieving page 123
Retrieving page 124
Retrieving page 125
Retrieving page 126
Retrieving page 127
Retrieving page 128
Retrieving page 129
Retrieving page 130
Retrieving page 131
Retrieving page 132
Retrieving page 133
Retrieving page 134
Retrieving page 135
Retrieving page 136
Retrieving page 137
Retrieving page 138
Retrieving page 139
Retrieving page 140
Retrieving page 141
Retrieving page 142
Retrieving page 143
Retrieving page 144
Retrieving page 145
#as.data.frame(nyt)

#binding the pages for the new result

# Stack the per-page data frames into a single data frame.
allNYTSearch <- jsonlite::rbind_pages(pages)

#binding the pages together and creating the final dataframe.

# Same combined result under its dated name; reuse the data frame that was
# already bound above instead of binding all pages a second time.
allNYTSearch20122019 <- allNYTSearch

#creating a dataframe that are only “news”: Function part

#getting rid of anything that could not be “news” so that it doesn’t interact with wrong data

# Drop the rows of `dataFrame` whose `col_dataframe` value matches any of the
# "not news" labels.
#
# dataFrame          data frame to filter.
# col_dataframe      vector with one entry per row of `dataFrame`; the labels
#                    are matched against it as a regular expression, so partial
#                    matches count. NA entries never match and are kept.
# not_news_variable  labels to exclude; defaults to the original fixed list.
#
# Returns the filtered data frame. For backward compatibility it is also
# stored in the global variable `allNYTSearch_OnlyNews`, which the rest of the
# script reads.
funct_remove_rows <- function(dataFrame, col_dataframe,
                              not_news_variable = c("Interactive Feature",
                                                    "Review", "Letter",
                                                    "Correction", "List")) {
  if (length(not_news_variable) == 0L) {
    # An empty pattern would match every row; nothing to exclude means keep all.
    keep <- rep(TRUE, nrow(dataFrame))
  } else {
    keep <- !grepl(paste(not_news_variable, collapse = "|"), col_dataframe)
  }
  filtered <- dataFrame[keep, ]
  allNYTSearch_OnlyNews <<- filtered
  filtered
}

#calling the function on the type_of_material column of allNYTSearch and removing interactive features, reviews, letters, corrections and lists

# Filter on the type-of-material column. The result is assigned explicitly
# (the function also stores it under this same global name) and then printed,
# as the bare call did.
allNYTSearch_OnlyNews <- funct_remove_rows(
  allNYTSearch,
  allNYTSearch$response.docs.type_of_material
)
allNYTSearch_OnlyNews

#getting rid of foreign fields in news

# Drop the rows of `dataFrame` whose `col_dataframe` value matches any of the
# non-US labels.
#
# dataFrame           data frame to filter.
# col_dataframe       vector with one entry per row of `dataFrame`; the labels
#                     are matched against it as a regular expression. NA
#                     entries never match and are kept.
# not_world_variable  labels to exclude; defaults to the original fixed list.
#
# Returns the filtered data frame. For backward compatibility it is also
# stored in the global variable `allNYTSearch_OnlyNews_inUS`, which the rest
# of the script reads.
funct_remove_notUS <- function(dataFrame, col_dataframe,
                               not_world_variable = c("Europe", "Middle East")) {
  if (length(not_world_variable) == 0L) {
    # An empty pattern would match every row; nothing to exclude means keep all.
    keep <- rep(TRUE, nrow(dataFrame))
  } else {
    keep <- !grepl(paste(not_world_variable, collapse = "|"), col_dataframe)
  }
  filtered <- dataFrame[keep, ]
  allNYTSearch_OnlyNews_inUS <<- filtered
  filtered
}

#calling the function on the subsection_name column of allNYTSearch_OnlyNews

# Filter on the subsection-name column. The result is assigned explicitly
# (the function also stores it under this same global name) and then printed,
# as the bare call did.
allNYTSearch_OnlyNews_inUS <- funct_remove_notUS(
  allNYTSearch_OnlyNews,
  allNYTSearch_OnlyNews$response.docs.subsection_name
)
allNYTSearch_OnlyNews_inUS

#Getting rid of white spaces #verify that this is right

# Normalise whitespace in the lead paragraphs with str_squish().
allNYTSearch_OnlyNews_inUS[["response.docs.lead_paragraph"]] <-
  str_squish(allNYTSearch_OnlyNews_inUS[["response.docs.lead_paragraph"]])

#Add Id variable to the NYTSearch_OnlyNews

# Running row id 1..n. seq_len() also works for a zero-row frame, where
# seq.int(0) would yield c(1, 0) and make the assignment fail.
# NOTE(review): the id is added to allNYTSearch_OnlyNews, but the later steps
# read allNYTSearch_OnlyNews_inUS, which was derived before this line - confirm
# which frame is meant to carry the id.
allNYTSearch_OnlyNews$id <- seq_len(nrow(allNYTSearch_OnlyNews))

#Convert dataframe into document term matrix from the lead paragraphs of allNYTSearch_OnlyNews_inUS

# Build a corpus with one document per lead paragraph.
myCorpus <- allNYTSearch_OnlyNews_inUS$response.docs.lead_paragraph %>%
  VectorSource() %>%
  Corpus()

# Document-term matrix of the corpus. The control options are listed in the
# same order as before, because tm processes some of them in the given order.
dtm_NYTArticles <- myCorpus %>%
  DocumentTermMatrix(
    control = list(
      removePunctuation = TRUE,
      stopwords = TRUE,
      tolower = TRUE,
      removeNumbers = TRUE
    )
  )

#tidy format sentiment analysis

library(dplyr)
library(tidytext)
package ‘tidytext’ was built under R version 3.6.1
# Tidy representation of the document-term matrix.
ap_td <- dtm_NYTArticles %>% tidy()

#Sentiment from the tdm created above with tidyverse

# Keep only the terms that appear in the "bing" lexicon, matching the `term`
# column of ap_td to the lexicon's `word` column.
ap_sentiments <- inner_join(
  ap_td,
  get_sentiments("bing"),
  by = c(term = "word")
)

#Sentiment from SentimentAnalysis package with LM dictionary

# Score every document with a single rule: ruleSentiment on the LM dictionary.
sentiment_scoreLM_cleancode <- dtm_NYTArticles %>%
  analyzeSentiment(
    rules = list(
      "SentimentLM" = list(ruleSentiment, loadDictionaryLM())
    )
  )

#Sentiment from SentimentAnalysis package with GI dictionary

# Score every document with a single rule: ruleSentiment on the GI dictionary.
sentiment_scoreGI_cleancode <- dtm_NYTArticles %>%
  analyzeSentiment(
    rules = list(
      "SentimentGI" = list(ruleSentiment, loadDictionaryGI())
    )
  )

#Sentiment from SentimentAnalysis package from qdap dictionary

# Run analyzeSentiment() with its default rules and keep only the
# SentimentQDAP column, then wrap that vector in a one-column data frame.
sentiment_scoreQDAP_cleancode <-
  analyzeSentiment(dtm_NYTArticles)[["SentimentQDAP"]]
sentiment_scoreQDAP_DF <- as.data.frame(sentiment_scoreQDAP_cleancode)

#putting the dictionaries in dataframe

# Word lists of the LM dictionary shipped with SentimentAnalysis.
# str() only prints and returns NULL, so wrapping it in list() stored nothing;
# the vectors are now assigned directly and str() is called just to preview
# them. The positive/negative lists were also swapped and are now matched to
# their names.
LM_positive <- DictionaryLM[["positive"]]
str(LM_positive)
#>  chr [1:354] "able" "abundance" "abundant" "acclaimed" "accomplish" "accomplished" "accomplishes" "accomplishing" ...
LM_negative <- DictionaryLM[["negative"]]
str(LM_negative)
#>  chr [1:2355] "abandon" "abandoned" "abandoning" "abandonment" "abandonments" "abandons" "abdicated" "abdicates" ...
LM_uncertantity <- DictionaryLM[["uncertainty"]]
str(LM_uncertantity)
#>  chr [1:297] "abeyance" "abeyances" "almost" "alteration" "alterations" "ambiguities" "ambiguity" "ambiguous" ...

# The three lists have different lengths, so they cannot be side-by-side
# columns; store them in long format with one row per (category, word).
LM_dicctionary <- rbind(
  data.frame(category = rep("positive", length(LM_positive)),
             word = LM_positive, stringsAsFactors = FALSE),
  data.frame(category = rep("negative", length(LM_negative)),
             word = LM_negative, stringsAsFactors = FALSE),
  data.frame(category = rep("uncertainty", length(LM_uncertantity)),
             word = LM_uncertantity, stringsAsFactors = FALSE)
)

#combining 3 sentiment analysis datasets

# Put the LM, GI and QDAP scores side by side (one row per document).
all_sentiment_DF <- cbind(
  sentiment_scoreLM_cleancode,
  sentiment_scoreGI_cleancode,
  sentiment_scoreQDAP_DF
)

#plotting sentiments

library(Hmisc)
# Histograms of the columns of the combined sentiment scores.
Hmisc::hist.data.frame(all_sentiment_DF)

WORD CLOUD

library("SnowballC")
library("wordcloud")
package ‘wordcloud’ was built under R version 3.6.1
library("RColorBrewer")

#Only run this for word cloud purposes

# Term-document matrix of the corpus (terms in rows), built with the same
# control options, in the same order, as the document-term matrix.
dtm_NYTArticles_word_cloud <- TermDocumentMatrix(
  myCorpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = TRUE,
    tolower = TRUE,
    removeNumbers = TRUE
  )
)
# Total frequency of each term across all documents, most frequent first.
m <- as.matrix(dtm_NYTArticles_word_cloud)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Number of top-ranked terms to leave out of the word cloud.
N <- 1
# Keep every row after the first N. A logical mask is used because
# d[-(1:N), ] would still drop the first row when N is 0.
newd <- d[seq_len(nrow(d)) > N, , drop = FALSE]
# Fixed seed so the word layout is reproducible.
set.seed(1234)
# Words and frequencies must come from the same data frame: pairing newd$word
# with d$freq gave vectors of different length and shifted every word onto
# another word's frequency.
wordcloud(words = newd$word, freq = newd$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

#convert R markdown to R script

#knitr::purl("Clean_Code_Sentimentr.Rmd", "Script_Clean_Code_Sentiment", documentation = 2)

#code for the word values of a certain column

# Count how many rows fall under each subsection name.
aggregate(
  data.frame(count = allNYTSearch_OnlyNews_inUS[["response.docs.subsection_name"]]),
  by = list(value = allNYTSearch_OnlyNews_inUS[["response.docs.subsection_name"]]),
  FUN = length
)

#saving data from 2019 to 2012 for sentiment analysis

# Copy of the full result without the multimedia, keywords and byline.person
# columns, written out as a tab-separated text file.
allNYTSearchtosave <- allNYTSearch[
  !(names(allNYTSearch) %in% c("response.docs.multimedia",
                               "response.docs.keywords",
                               "response.docs.byline.person"))
]
#write.xlsx(allNYTSearch, 'allNYTSearch2012-2019-1459hits.xlsx')
write.table(allNYTSearchtosave, file = "allNYTSearch20122019hits.txt", sep = "\t")
---
title: "Clean Code SentimentR with API extraction of first paragraph"
output: html_notebook
---


#Loading necessary libraries for sentiment analysis
```{r}
library(sentimentr)
library(qdap)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(tm)
library(tidyverse)
library(SentimentAnalysis)
library(quanteda)
library(xlsx)
```

#Creating API variable for "Article Search API FORMAT"
```{r}
# NYT Article Search API key.
# Prefer the NYTIMES_KEY environment variable so the secret does not have to
# live in the source file; the literal is kept only as a fallback so that
# existing runs keep working unchanged.
# NOTE(review): this key is committed in plain text - rotate it and then
# remove the fallback.
NYTIMES_KEY <- Sys.getenv("NYTIMES_KEY")
if (!nzchar(NYTIMES_KEY)) {
  NYTIMES_KEY <- "a0HA3uBISDkGyvUGR3FeoAGybtDVPPM5"
}
```

#creating the query necessary to extract wanted information

#Professor Johnson still has to give me the list of words necessary for query

#Doesn't seem like we can do "and" "or" for the specific query, something that academic search complete and pro quest are able to do

#Just do it for one month 2012/01/01 - 2012/02/01

#The default connector for values in parentheses is OR. 
```{r}
# Query parameters: the search window, given as YYYYMMDD strings.
# A "+"-joined term list was tried first, but the words end up concatenated:
#term <- "immigrant+immigration"
begin_date <- "19800101"
end_date <- "20191201" # shrink the window to about 2 months to get data faster
#glocations <- "U.S"
```

#Searching for the terms that we query above and essentially creating the query object in order to use it for the API

#the https needs to be underlined

#immigrant OR migrant OR migration OR refugee OR asylum OR undocumented"

#it needs to be fq not q

```{r}
# Base request URL for the Article Search API; "&page=<n>" is appended later.
# - The terms go in the filter query (`fq`, not `q`) and are OR-ed together.
# - The query is percent-encoded so the spaces around OR are valid in a URL.
# - paste0() has no `sep` argument: the old `sep = " "` was simply pasted onto
#   the end of the URL, leaving a stray space after the API key.
baseurl <- paste0(
  "https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=",
  URLencode(
    "immigrant OR immigration OR migrant OR migration OR refugee OR asylum OR undocumented",
    reserved = TRUE
  ),
  "&begin_date=", begin_date,
  "&end_date=", end_date,
  "&facet_filter=true&api-key=", NYTIMES_KEY
)
```


#Returning a Json object and calculating the max pages from the query, there is a max of 10 objects (newspaper articles) per page
```{r}
# First request: only used to read the total number of hits.
initialQuery <- fromJSON(baseurl)
# Pages are numbered from 0 and hold at most 10 articles each, so the last page
# index is ceiling(hits / 10) - 1. round() could drop a final partial page
# (e.g. 1452 hits -> 144 instead of 145); max() keeps the index at 0 or above
# when there are no hits.
maxPages <- max(ceiling(initialQuery$response$meta$hits[1] / 10) - 1, 0)
```

#from the maxpages we are pasting the baseurl and retrieving the information and putting it in a dataframe

#Putting the system to sleep in order to 'trick' the computer for the amount of requests made per minute

#query from JAN 2012 - 2019 JAN took >24 minutes to run with sleep 30

#it doesn't like it when using %>% data.frame() in the end of the third line
```{r}
#require(reshape2)
# Download every result page (numbered 0..maxPages) as a flattened data frame.
# Page i is stored at position i + 1 of `pages`, which is allocated up front
# instead of being grown inside the loop.
pages <- vector("list", maxPages + 1)
for (i in seq_along(pages) - 1L) { # empty loop (not 0, -1) if there are no pages
  nytSearch <- fromJSON(paste0(baseurl, "&page=", i), flatten = TRUE) %>% data.frame()
  message("Retrieving page ", i)
  pages[[i + 1]] <- nytSearch
  # Pause between requests to stay under the API's per-minute request limit;
  # no need to wait after the final page.
  if (i < maxPages) {
    Sys.sleep(15) #change to 5 to test on smaller datasets, 30 for actual execution
  }
}
```
```{r}
#as.data.frame(nyt)
```


#binding the pages for the new result
```{r}
# Stack the per-page data frames into a single data frame.
allNYTSearch <- jsonlite::rbind_pages(pages)
```


#binding the pages together and creating the final dataframe. 

```{r}
# Same combined result under its dated name; reuse the data frame that was
# already bound in the previous chunk instead of binding all pages again.
allNYTSearch20122019 <- allNYTSearch
```

#creating a dataframe that are only "news": Function part

#getting rid of anything that could not be "news" so that it doesn't interact with wrong data
```{r}
# Drop the rows of `dataFrame` whose `col_dataframe` value matches any of the
# "not news" labels.
#
# dataFrame          data frame to filter.
# col_dataframe      vector with one entry per row of `dataFrame`; the labels
#                    are matched against it as a regular expression, so partial
#                    matches count. NA entries never match and are kept.
# not_news_variable  labels to exclude; defaults to the original fixed list.
#
# Returns the filtered data frame. For backward compatibility it is also
# stored in the global variable `allNYTSearch_OnlyNews`, which the rest of the
# notebook reads.
funct_remove_rows <- function(dataFrame, col_dataframe,
                              not_news_variable = c("Interactive Feature",
                                                    "Review", "Letter",
                                                    "Correction", "List")) {
  if (length(not_news_variable) == 0L) {
    # An empty pattern would match every row; nothing to exclude means keep all.
    keep <- rep(TRUE, nrow(dataFrame))
  } else {
    keep <- !grepl(paste(not_news_variable, collapse = "|"), col_dataframe)
  }
  filtered <- dataFrame[keep, ]
  allNYTSearch_OnlyNews <<- filtered
  filtered
}
```


#calling the function on the type_of_material column of allNYTSearch and removing interactive features, reviews, letters, corrections and lists

```{r}
# Filter on the type-of-material column. The result is assigned explicitly
# (the function also stores it under this same global name) and then printed,
# as the bare call did.
allNYTSearch_OnlyNews <- funct_remove_rows(
  allNYTSearch,
  allNYTSearch$response.docs.type_of_material
)
allNYTSearch_OnlyNews
```

#getting rid of foreign fields in news
```{r}

# Drop the rows of `dataFrame` whose `col_dataframe` value matches any of the
# non-US labels.
#
# dataFrame           data frame to filter.
# col_dataframe       vector with one entry per row of `dataFrame`; the labels
#                     are matched against it as a regular expression. NA
#                     entries never match and are kept.
# not_world_variable  labels to exclude; defaults to the original fixed list.
#
# Returns the filtered data frame. For backward compatibility it is also
# stored in the global variable `allNYTSearch_OnlyNews_inUS`, which the rest
# of the notebook reads.
funct_remove_notUS <- function(dataFrame, col_dataframe,
                               not_world_variable = c("Europe", "Middle East")) {
  if (length(not_world_variable) == 0L) {
    # An empty pattern would match every row; nothing to exclude means keep all.
    keep <- rep(TRUE, nrow(dataFrame))
  } else {
    keep <- !grepl(paste(not_world_variable, collapse = "|"), col_dataframe)
  }
  filtered <- dataFrame[keep, ]
  allNYTSearch_OnlyNews_inUS <<- filtered
  filtered
}

```


#calling the function on the subsection_name column of allNYTSearch_OnlyNews

```{r}
# Filter on the subsection-name column. The result is assigned explicitly
# (the function also stores it under this same global name) and then printed,
# as the bare call did.
allNYTSearch_OnlyNews_inUS <- funct_remove_notUS(
  allNYTSearch_OnlyNews,
  allNYTSearch_OnlyNews$response.docs.subsection_name
)
allNYTSearch_OnlyNews_inUS
```



#Getting rid of white spaces
#verify that this is right
```{r}
# Normalise whitespace in the lead paragraphs with str_squish().
allNYTSearch_OnlyNews_inUS[["response.docs.lead_paragraph"]] <-
  str_squish(allNYTSearch_OnlyNews_inUS[["response.docs.lead_paragraph"]])
```

#Add Id variable to the NYTSearch_OnlyNews 
```{r}
# Running row id 1..n. seq_len() also works for a zero-row frame, where
# seq.int(0) would yield c(1, 0) and make the assignment fail.
# NOTE(review): the id is added to allNYTSearch_OnlyNews, but the later chunks
# read allNYTSearch_OnlyNews_inUS, which was derived before this line - confirm
# which frame is meant to carry the id.
allNYTSearch_OnlyNews$id <- seq_len(nrow(allNYTSearch_OnlyNews))
```



#Convert dataframe into document term matrix from the lead paragraphs of allNYTSearch_OnlyNews_inUS
```{r}
# Build a corpus with one document per lead paragraph.
myCorpus <- allNYTSearch_OnlyNews_inUS$response.docs.lead_paragraph %>%
  VectorSource() %>%
  Corpus()

# Document-term matrix of the corpus. The control options are listed in the
# same order as before, because tm processes some of them in the given order.
dtm_NYTArticles <- myCorpus %>%
  DocumentTermMatrix(
    control = list(
      removePunctuation = TRUE,
      stopwords = TRUE,
      tolower = TRUE,
      removeNumbers = TRUE
    )
  )
```





#tidy format sentiment analysis
```{r}
library(dplyr)
library(tidytext)
# Tidy representation of the document-term matrix.
ap_td <- dtm_NYTArticles %>% tidy()
```

#Sentiment from the tdm created above with tidyverse
```{r}
# Keep only the terms that appear in the "bing" lexicon, matching the `term`
# column of ap_td to the lexicon's `word` column.
ap_sentiments <- inner_join(
  ap_td,
  get_sentiments("bing"),
  by = c(term = "word")
)
```
#Sentiment from SentimentAnalysis package with LM dictionary

```{r}
# Score every document with a single rule: ruleSentiment on the LM dictionary.
sentiment_scoreLM_cleancode <- dtm_NYTArticles %>%
  analyzeSentiment(
    rules = list(
      "SentimentLM" = list(ruleSentiment, loadDictionaryLM())
    )
  )
```
#Sentiment from SentimentAnalysis package with GI dictionary


```{r}
# Score every document with a single rule: ruleSentiment on the GI dictionary.
sentiment_scoreGI_cleancode <- dtm_NYTArticles %>%
  analyzeSentiment(
    rules = list(
      "SentimentGI" = list(ruleSentiment, loadDictionaryGI())
    )
  )


```

#Sentiment from SentimentAnalysis package from qdap dictionary

```{r}
# Run analyzeSentiment() with its default rules and keep only the
# SentimentQDAP column, then wrap that vector in a one-column data frame.
sentiment_scoreQDAP_cleancode <-
  analyzeSentiment(dtm_NYTArticles)[["SentimentQDAP"]]
sentiment_scoreQDAP_DF <- as.data.frame(sentiment_scoreQDAP_cleancode)
```


#putting the dictionaries in dataframe
```{r}

# Word lists of the LM dictionary shipped with SentimentAnalysis.
# str() only prints and returns NULL, so wrapping it in list() stored nothing;
# the vectors are now assigned directly and str() is called just to preview
# them. The positive/negative lists were also swapped and are now matched to
# their names.
LM_positive <- DictionaryLM[["positive"]]
str(LM_positive)
LM_negative <- DictionaryLM[["negative"]]
str(LM_negative)
LM_uncertantity <- DictionaryLM[["uncertainty"]]
str(LM_uncertantity)

# The three lists have different lengths, so they cannot be side-by-side
# columns; store them in long format with one row per (category, word).
LM_dicctionary <- rbind(
  data.frame(category = rep("positive", length(LM_positive)),
             word = LM_positive, stringsAsFactors = FALSE),
  data.frame(category = rep("negative", length(LM_negative)),
             word = LM_negative, stringsAsFactors = FALSE),
  data.frame(category = rep("uncertainty", length(LM_uncertantity)),
             word = LM_uncertantity, stringsAsFactors = FALSE)
)

```


#combining 3 sentiment analysis datasets

```{r}
# Put the LM, GI and QDAP scores side by side (one row per document).
all_sentiment_DF <- cbind(
  sentiment_scoreLM_cleancode,
  sentiment_scoreGI_cleancode,
  sentiment_scoreQDAP_DF
)
```

#plotting sentiments
```{r}
library(Hmisc)
# Histograms of the columns of the combined sentiment scores.
Hmisc::hist.data.frame(all_sentiment_DF)
```




WORD CLOUD
```{r}
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
```



#Only run this for word cloud purposes
```{r}
# Term-document matrix of the corpus (terms in rows), built with the same
# control options, in the same order, as the document-term matrix.
dtm_NYTArticles_word_cloud <- TermDocumentMatrix(
  myCorpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = TRUE,
    tolower = TRUE,
    removeNumbers = TRUE
  )
)
# Total frequency of each term across all documents, most frequent first.
m <- as.matrix(dtm_NYTArticles_word_cloud)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Number of top-ranked terms to leave out of the word cloud.
N <- 1

# Keep every row after the first N. A logical mask is used because
# d[-(1:N), ] would still drop the first row when N is 0.
newd <- d[seq_len(nrow(d)) > N, , drop = FALSE]
```

```{r}
# Fixed seed so the word layout is reproducible.
set.seed(1234)
# Words and frequencies must come from the same data frame: pairing newd$word
# with d$freq gave vectors of different length and shifted every word onto
# another word's frequency.
wordcloud(words = newd$word, freq = newd$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
```



#convert R markdown to R script
```{r}
#knitr::purl("Clean_Code_Sentimentr.Rmd", "Script_Clean_Code_Sentiment", documentation = 2)
```
#code for the word values of a certain column
```{r}
# Count how many rows fall under each subsection name.
aggregate(
  data.frame(count = allNYTSearch_OnlyNews_inUS[["response.docs.subsection_name"]]),
  by = list(value = allNYTSearch_OnlyNews_inUS[["response.docs.subsection_name"]]),
  FUN = length
)
```

#saving data from 2019 to 2012 for sentiment analysis
```{r}
# Copy of the full result without the multimedia, keywords and byline.person
# columns, written out as a tab-separated text file.
allNYTSearchtosave <- allNYTSearch[
  !(names(allNYTSearch) %in% c("response.docs.multimedia",
                               "response.docs.keywords",
                               "response.docs.byline.person"))
]
#write.xlsx(allNYTSearch, 'allNYTSearch2012-2019-1459hits.xlsx')
write.table(allNYTSearchtosave, file = "allNYTSearch20122019hits.txt", sep = "\t")
```





